seed = 65
set.seed(seed)
#print(as.matrix(colnames(big_df)))
col_id <- grep('[(]v[)]', colnames(big_df))
colnames(big_df) <- gsub('[(]v[)]', '', colnames(big_df))
smpl <- denoisingCTF::t_asinh(big_df)
# UMAP
# umap_smp <- umap::umap(smpl[,col_id])
# umap_smp <- as.data.frame(cbind(smpl,
# UMAP1 = umap_smp$layout[,1],
# UMAP2 = umap_smp$layout[,2]))
# save(umap_smp, file = "UMAPcelllines_paper2020.RData")
load("~/Documents/Massion_lab/manuscripts/prelim_cytof_adc/data/cell_lines/processed_files/mix/UMAPcelllines_paper2020.RData")
## TableGrob (4 x 3) "arrange": 12 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (2-2,1-1) arrange gtable[layout]
## 5 5 (2-2,2-2) arrange gtable[layout]
## 6 6 (2-2,3-3) arrange gtable[layout]
## 7 7 (3-3,1-1) arrange gtable[layout]
## 8 8 (3-3,2-2) arrange gtable[layout]
## 9 9 (3-3,3-3) arrange gtable[layout]
## 10 10 (4-4,1-1) arrange gtable[layout]
## 11 11 (4-4,2-2) arrange gtable[layout]
## 12 12 (4-4,3-3) arrange gtable[layout]
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Fig 2. Clustering cell lines
# find optimal number of clusters (elbow plot)
# DetermineNumberOfClusters(smpl[,col_id],k_max=45,plot=T,smooth=0.2, iter.max=50, seed = 45,
# ask_ft = F,arcsn_tr = F)
# k-means clustering
set.seed(45)
cl_data <- stats::kmeans(smpl[,col_id], centers = 8, iter.max = 50)$cluster
df_cluster <- cbind(smpl[,col_id], 'cluster'= cl_data, "cl_ID"=smpl$cl_ID, umap_smp[c('UMAP1', 'UMAP2')])
# proyect clusters on UMAP
kl <- c("gray60", "#AAD7AC", "#F9C769", "steelblue1", "#B27CC1", "lightskyblue1", "#F2BFB9", "cornflowerblue")
ClusterUMAP_plot(df_cluster, color_by_cluster = T, color_by_protein = F, pallete = F,
cluster_col = 'cluster', plot_clusnames = T, plot_title = 'Cluster ID',
lg_names = levels(as.factor(df_cluster$cluster)),
x_lim = c(-14,11), y_lim = c(-10,14), plot_colors = kl)
# barplot
prcnt_by_cl <- ClassAbundanceByPt(data=df_cluster, ptID_col = 'cluster',
class_col = 'cl_ID')
prcnt_by_cl <- prcnt_by_cl[order(row.names(prcnt_by_cl)),]
#rownames(prcnt_by_cl) <- paste0(rownames(prcnt_by_pt), '_', ref$CANARY)
prcnt_by_cl <- prcnt_by_cl*100
ct <- c("#EF7E33", "#52A02E", "#D7392E", "#9467BD", "#2977B4")
bp_order <- c(4,8,6,1,5,7,2,3)
dendrogram_barplot(prcnt_by_cl, dist_method = 'euclidean',
hclust_method = 'complete', coul = ct, BPOrderAsDendrogram=F,
bp_order=bp_order, xlabl = "Cell type (% of cluster)" , ylabl = "Cluster ID",
cex_names = 1, cex_axis = 1.2,cex_lab = 0.5) #cex.sub and cex.lab do nothing
load("~/Documents/Massion_lab/manuscripts/prelim_cytof_adc/data/cell_lines/processed_files/cell_lines_replicates_gated/cell_lines_val_labeled.RData")
#print(as.matrix(colnames(big_df)))
col_id <- grep('[(]v[)]', colnames(big_df_ctl))
#x <- c(15, 37, 41, 38, 26, 18, 45, 29, 46, 34, 44, 47)
colnames(big_df_ctl) <- gsub('[(]v[)]', '', colnames(big_df_ctl))
df <- denoisingCTF::t_asinh(big_df_ctl[,col_id])
colnames(df) <- gsub(" ","", sapply(strsplit(as.character(colnames(df)), "_"), "[[", 2))
df <- cbind(df, cl_ID=big_df_ctl$cl_ID, sample_ID=big_df_ctl$sample_ID)
df <- melt(df)
## Using cl_ID, sample_ID as id variables
# A549
ggplot(df[which(df$cl_ID=='A549'),], aes(x=value, group=sample_ID, color=sample_ID)) +
geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")
# H23
ggplot(df[which(df$cl_ID=='H23'),], aes(x=value, group=sample_ID, color=sample_ID)) +
geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")
# H3122
ggplot(df[which(df$cl_ID=='H3122'),], aes(x=value, group=sample_ID, color=sample_ID)) +
geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")
# Monocytes
ggplot(df[which(df$cl_ID=='Monocytes'),], aes(x=value, group=sample_ID, color=sample_ID)) +
geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")
# PC9
ggplot(df[which(df$cl_ID=='PC9'),], aes(x=value, group=sample_ID, color=sample_ID)) +
geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")
# Tc.cells
ggplot(df[which(df$cl_ID=='Tc.cells'),], aes(x=value, group=sample_ID, color=sample_ID)) +
geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")
# Th.cells
ggplot(df[which(df$cl_ID=='Th.cells'),], aes(x=value, group=sample_ID, color=sample_ID)) +
geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")
# Changing Tc_cells and Th_cells
umap_smp$subtype <- as.character(umap_smp$subtype)
umap_smp$subtype4 <- as.character(umap_smp$subtype4)
k <- which(umap_smp$subtype =='Tc_cells')
umap_smp[k, 'subtype'] <- 'CD8T_cells'
umap_smp[k, 'subtype4'] <- 'CD8T_cells'
k <- which(umap_smp$subtype =='Th_cells')
umap_smp[k, 'subtype'] <- 'CD4T_cells'
umap_smp[k, 'subtype4'] <- 'CD4T_cells'
umap_smp$subtype <- as.factor(umap_smp$subtype)
umap_smp$subtype4 <- as.factor(umap_smp$subtype4)
umap_smp$subtype4 <- factor(umap_smp$subtype4, levels = c("Epithelial", "Endothelial", "Mesenchymal", "Other_immune", "Myeloid", "CD8T_cells", "CD4T_cells"))
umap_smp['subtype4'] <- as.character(umap_smp$subtype)
umap_smp[which(umap_smp$cell_type=='Epithelial'), 'subtype4'] <- 'ECC'
umap_smp$subtype4 <- as.factor(umap_smp$subtype4)
umap_smp$subtype4 <- factor(umap_smp$subtype4, levels = c("ECC", "Endothelial", "Mesenchymal", "Other_immune", "Myeloid", "CD8T_cells", "CD4T_cells"))
## TableGrob (2 x 6) "arrange": 12 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (1-1,4-4) arrange gtable[layout]
## 5 5 (1-1,5-5) arrange gtable[layout]
## 6 6 (1-1,6-6) arrange gtable[layout]
## 7 7 (2-2,1-1) arrange gtable[layout]
## 8 8 (2-2,2-2) arrange gtable[layout]
## 9 9 (2-2,3-3) arrange gtable[layout]
## 10 10 (2-2,4-4) arrange gtable[layout]
## 11 11 (2-2,5-5) arrange gtable[layout]
## 12 12 (2-2,6-6) arrange gtable[layout]
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
##
## Attaching package: 'operators'
## The following objects are masked from 'package:base':
##
## options, strrep
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:operators':
##
## %>%
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## cell_type p.value FDR_corrected
## 1 Myeloid 0.3523810 0.6166667
## 2 Other_immune 0.7619048 0.8888889
## 3 CD4T_cells 0.3523810 0.6166667
## 4 CD8T_cells 0.1714286 0.6000000
## 5 Epithelial 0.9142857 0.9142857
## 6 Mesenchymal 0.4761905 0.6666667
## 7 Endothelial 0.1714286 0.6000000
## Using pt_ID, CANARY as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties
## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(2.63407697361086, 0.0695405059165581,
## 2.61931335471328, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.347064923349515, 0.756454263937334,
## 0.461532710617599, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(6.85846430892733e-05, 0.176901427304811, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.604787413934418, 2.00996443199906,
## 0.22811706521791, : cannot compute exact p-value with ties
## Protein p.value FDR_corrected
## 1 141Pr_EpCAM 0.76190476 1.0000000
## 2 142Nd_ccasp3 1.00000000 1.0000000
## 3 144Nd_HLA-ABC 0.06910652 0.6095238
## 4 147Sm_b-CAT 0.10550173 0.6095238
## 5 151Eu_TTF1 0.22976627 0.9190651
## 6 154Sm_CD45 0.60952381 1.0000000
## 7 155Gd_CD56 1.00000000 1.0000000
## 8 156Gd_Vimentin 0.91428571 1.0000000
## 9 158Gd_p-STAT3 0.91428571 1.0000000
## 10 160Gd_MDM2 0.91428571 1.0000000
## 11 161Dy_Cytokeratin 0.91428571 1.0000000
## 12 163Dy_TP63 0.60952381 1.0000000
## 13 164Dy_CK7 1.00000000 1.0000000
## 14 166Er_CD44 0.47619048 1.0000000
## 15 170Yb_CD3 0.76190476 1.0000000
## 16 174Yb_HLA-DR 0.11428571 0.6095238
## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Protein p.value FDR_corrected
## 1 144Nd_HLA-ABC 0.1714286 0.7619048
## 2 145Nd_CD31 0.4761905 0.7619048
## 3 146Nd_Thioredoxin 0.3523810 0.7619048
## 4 147Sm_b-CAT 0.4761905 0.7619048
## 5 156Gd_Vimentin 1.0000000 1.0000000
## 6 158Gd_p-STAT3 0.7619048 0.8465608
## 7 160Gd_MDM2 0.6095238 0.7619048
## 8 161Dy_Cytokeratin 0.4761905 0.7619048
## 9 163Dy_TP63 0.6095238 0.7619048
## 10 174Yb_HLA-DR 0.1714286 0.7619048
## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(0, 0.0958118253303701, 0.284162729407862, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.802649816095183, 0.215492384630114,
## 0.0605647943338007, : cannot compute exact p-value with ties
## Protein p.value FDR_corrected
## 1 141Pr_EpCAM 0.76190476 0.9142857
## 2 142Nd_ccasp3 0.91428571 0.9142857
## 3 155Gd_CD56 0.47619048 0.8571429
## 4 156Gd_Vimentin 0.91428571 0.9142857
## 5 158Gd_p-STAT3 0.11428571 0.5142857
## 6 160Gd_MDM2 0.19945761 0.5983728
## 7 163Dy_TP63 0.47619048 0.8571429
## 8 170Yb_CD3 0.91406196 0.9142857
## 9 174Yb_HLA-DR 0.03809524 0.3428571
## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(3.06708656196431, 0.066910551904235,
## 3.04431780411568, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.202586121632867, 0.0645635010399363, : cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(c(0.501727573634015, 0.130067040154415,
## 0.712432219756351, : cannot compute exact p-value with ties
## Protein p.value FDR_corrected
## 1 144Nd_HLA-ABC 0.1055017 0.8571429
## 2 154Sm_CD45 0.4761905 0.9523810
## 3 156Gd_Vimentin 0.4761905 0.9523810
## 4 158Gd_p-STAT3 0.7619048 0.9523810
## 5 159Tb_CD4 0.9140620 1.0000000
## 6 163Dy_TP63 0.6095238 0.9523810
## 7 166Er_CD44 1.0000000 1.0000000
## 8 170Yb_CD3 0.7619048 0.9523810
## 9 171Yb_CD11b 0.3314247 0.9523810
## 10 174Yb_HLA-DR 0.1714286 0.8571429
## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Protein p.value FDR_corrected
## 1 144Nd_HLA-ABC 0.03174603 0.2539683
## 2 154Sm_CD45 0.90476190 0.9047619
## 3 156Gd_Vimentin 0.55555556 0.8344671
## 4 163Dy_TP63 0.19047619 0.5079365
## 5 166Er_CD44 0.73015873 0.8344671
## 6 168Er_CD8 0.55555556 0.8344671
## 7 170Yb_CD3 0.73015873 0.8344671
## 8 174Yb_HLA-DR 0.11111111 0.4444444
## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(0, 0.0561621657336121, 0, 0), c(0, 0,
## 1.50456532271426, : cannot compute exact p-value with ties
## Protein p.value FDR_corrected
## 1 144Nd_HLA-ABC 0.6095238 0.9142857
## 2 154Sm_CD45 0.3523810 0.7928571
## 3 156Gd_Vimentin 0.2571429 0.7714286
## 4 159Tb_CD4 0.6095238 0.9142857
## 5 163Dy_TP63 0.7619048 0.9795918
## 6 166Er_CD44 0.1714286 0.7714286
## 7 169Tm_CD24 1.0000000 1.0000000
## 8 170Yb_CD3 0.2571429 0.7714286
## 9 174Yb_HLA-DR 0.9142857 1.0000000
## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(3.30075534359615, 0.0972935332589382,
## 3.31354462816258, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1.62386202915046, 0.717296683223608,
## 2.44717234480325, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.748820669157824, 0.123253874437529,
## 1.67192224879611, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.63662474810693, 0.0913139040611759,
## 1.20841095112529, : cannot compute exact p-value with ties
## Protein p.value FDR_corrected
## 1 144Nd_HLA-ABC 0.1645218 0.9523810
## 2 145Nd_CD31 0.4541743 0.9523810
## 3 146Nd_Thioredoxin 0.5929097 0.9846154
## 4 154Sm_CD45 0.4761905 0.9523810
## 5 156Gd_Vimentin 0.4761905 0.9523810
## 6 158Gd_p-STAT3 0.3523810 0.9523810
## 7 159Tb_CD4 0.7461278 0.9846154
## 8 160Gd_MDM2 0.9142857 0.9846154
## 9 163Dy_TP63 1.0000000 1.0000000
## 10 166Er_CD44 0.9142857 0.9846154
## 11 171Yb_CD11b 0.4761905 0.9523810
## 12 172Yb_p-S6 0.9142857 0.9846154
## 13 174Yb_HLA-DR 0.2571429 0.9523810
## 14 175Lu_PD-L1 0.7619048 0.9846154
## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(2.33079293987923, 0.0582644008491126,
## 1.75087785796523, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1.63156525887895, 0.322305842850225,
## 1.32203451663395, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1.13653172069154, 0.0527562183492093,
## 1.50292979733778, : cannot compute exact p-value with ties
## Protein p.value FDR_corrected
## 1 141Pr_EpCAM 0.11428571 0.4114286
## 2 142Nd_ccasp3 0.35238095 0.5714286
## 3 144Nd_HLA-ABC 0.16452182 0.4408163
## 4 147Sm_b-CAT 0.17142857 0.4408163
## 5 148Nd_HER2 0.03809524 0.4114286
## 6 151Eu_TTF1 0.09898793 0.4114286
## 7 154Sm_CD45 0.47619048 0.5714286
## 8 155Gd_CD56 0.76190476 0.8067227
## 9 156Gd_Vimentin 0.47619048 0.5714286
## 10 158Gd_p-STAT3 0.35238095 0.5714286
## 11 160Gd_MDM2 0.91428571 0.9142857
## 12 161Dy_Cytokeratin 0.11428571 0.4114286
## 13 162Dy_MET 0.25714286 0.5142857
## 14 163Dy_TP63 0.47619048 0.5714286
## 15 164Dy_CK7 0.47619048 0.5714286
## 16 165Ho_EGFR 0.10550173 0.4114286
## 17 170Yb_CD3 0.60952381 0.6857143
## 18 174Yb_HLA-DR 0.25714286 0.5142857
ref_samples <- ref
# Check HLA-DR expression in controls
load("/Users/senosam/Documents/Massion_lab/CyTOF_summary/controls/annotated_controls.RData")
ref_clt <- ref[which(ref$CyTOF_date %in% ref_samples$CyTOF_date),]
annot_df_ctl <- annot_df_ctl[which(annot_df_ctl$CyTOF_date %in% ref_samples$CyTOF_date),]
ramos <- annot_df_ctl[which(annot_df_ctl$cell_line=='Ramos'),]
a549 <- annot_df_ctl[which(annot_df_ctl$cell_line=='A549'),]
ramos_median <- aggregate(denoisingCTF::t_asinh(ramos[,c(15, 17:31, 33:35, 37:48, 50:51)]), list(ramos[,'CyTOF_date']), median)
a549_median <- aggregate(denoisingCTF::t_asinh(a549[,c(15, 17:31, 33:35, 37:48, 50:51)]), list(a549[,'CyTOF_date']), median)
a549_median['Cell_line'] <- rep('A549', nrow(a549_median))
ramos_median['Cell_line'] <- rep('Ramos', nrow(ramos_median))
ctl_median <- rbind(a549_median, ramos_median)
ggplot(ctl_median, aes(x=Cell_line, y=`174Yb_HLA-DR`, fill = Cell_line)) +
geom_dotplot(binaxis='y', stackdir='center', dotsize=0.8) +
ylim(0,6)+
#facet_wrap(~variable) +
theme(plot.title = element_text(hjust = 0.5, size=22),
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
summary(ctl_median$`174Yb_HLA-DR`[1:4])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4604 0.7593 0.8666 0.8193 0.9267 1.0836
summary(ctl_median$`174Yb_HLA-DR`[5:8])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.186 3.244 4.329 3.726 4.812 5.059
umap_smp['subtype'] <- gsub('Epithelial', 'ECCc' ,umap_smp$subtype)
umap_smp$subtype <- as.factor(umap_smp$subtype)
umap_smp$subtype <- factor(umap_smp$subtype, levels = c(paste0("ECCc_", 1:10)))
#umap_smp$subtype4 <- factor(umap_smp$subtype4, levels = c("ECC", "Endothelial", "Mesenchymal", "Other_immune", "Myeloid", "Tc_cells", "Th_cells"))
## Warning: Removed 28 rows containing non-finite values (stat_density2d).
## Warning: Removed 28 rows containing missing values (geom_point).
## Warning: Removed 28 rows containing missing values (geom_point).
## Warning: Removed 28 rows containing missing values (geom_point).
## Warning in if (density) {: the condition has length > 1 and only the first
## element will be used
## Warning: Removed 2288 rows containing non-finite values (stat_density2d).
## Warning: Removed 2288 rows containing missing values (geom_point).
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
## Warning: Ignoring unknown parameters: label.size
## `geom_smooth()` using formula 'y ~ x'
## Warning: Ignoring unknown parameters: label.size
## `geom_smooth()` using formula 'y ~ x'
# xCell deconvolution TCGA data
xcell <- read.delim("https://xcell.ucsf.edu/xCell_TCGA_RSEM.txt", row.names = 1)
cn_xc <- sapply(strsplit(colnames(xcell), ".01$"), "[[", 1)
cn_xc <- gsub('\\.', '_', cn_xc)
colnames(xcell) <- cn_xc
xcell <- xcell[,which(colnames(xcell) %in% tcga_adc_clinical$submitter_id)]
label_by_gene <- function(normalized_data, gene, meta_data, extremes_only=FALSE, cond_name){
if(extremes_only){
data_t <- data.frame(t(normalized_data))
first_q <- quantile(data_t[,gene])[["25%"]]
third_q <- quantile(data_t[,gene])[["75%"]]
data_t$Condition <- "normal"
data_t$Condition[data_t[,gene] < first_q] <- "low"
data_t$Condition[data_t[,gene] > third_q] <- "high"
} else {
data_t <- data.frame(t(normalized_data))
mid_q <- quantile(data_t[,gene])[["50%"]]
data_t$Condition <- "low"
data_t$Condition[data_t[,gene] > mid_q] <- "high"
}
cn <- c(colnames(meta_data), cond_name)
meta_data <- cbind(meta_data, x=data_t$Condition)
meta_data$x <- as.factor(meta_data$x)
colnames(meta_data) <- cn
meta_data
}
# Labeling the data based on high vs low expression
hla_genes <- gsub('-', '.', rownames(dt_hla))
pdt <- tcga_adc_clinical
for(i in hla_genes){
pdt <- label_by_gene(log2(dt_hla+1), gene=i, meta_data = pdt, extremes_only=TRUE, cond_name=i)
}
# Selecting cell types of interest
xcell_ct <- rownames(xcell)[c(6,7,11:13)]
# Subsetting a df with the xcell data to be used
x <- t(xcell)
x_clin <- pdt[rownames(x),]
x <- x[,xcell_ct]
x <- cbind(x, x_clin[,70:ncol(x_clin)])
x <- na.omit(x)
x <- reshape2::melt(x)
## Using HLA.DRA, HLA.DRB5, HLA.DRB6, HLA.DRB1, HLA.DQA1, HLA.DQB1, HLA.DQA2, HLA.DQB2, HLA.DOB, HLA.DMB, HLA.DMA, HLA.DOA, HLA.DPA1, HLA.DPB1, HLA.DPB2 as id variables
# Generating summary table
sum_df <- data.frame(matrix(nrow = 0, ncol = 5))
for (i in colnames(x)[grep("H",colnames(x))]){
for (ii in unique(x$variable)) {
k <- which(x$variable == ii)
x_sub <- x[k,]
high_i <- which(x_sub[,i] == 'high')
low_i <- which(x_sub[,i] == 'low')
high_med <- median(x_sub[high_i,'value'])
low_med <- median(x_sub[low_i,'value'])
pval <- wilcox.test(x_sub[high_i,'value'], x_sub[low_i,'value'])
sum_df <- rbind(sum_df, c(i, ii, high_med, low_med, pval$p.value))
#cat(high_med, low_med, pval$p.value)
}
}
colnames(sum_df) <- c('Gene', 'Cell type', 'Median.High', 'Median.Low', 'p.value')
sum_df['p.adjusted'] <- p.adjust(sum_df$p.value, method = "BH")
#write.csv(sum_df, file = 'summary_xcellTCGA.csv')
violin_deconv <- function(x, ct, cluster_ID) {
k <- which(x$variable %in% ct)
x_sub <- x[k,c(cluster_ID, "variable", "value")]
k2 <- which(!x_sub[,cluster_ID] == 'normal')
x_sub <- x_sub[k2,]
ggplot(x_sub, aes_string(x=cluster_ID, y='value', fill=cluster_ID)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.1, fill="white") +
labs(title=NULL,x="Patient group", y = "Enrichment score") +
scale_fill_brewer(palette="Dark2",name = gsub('\\.', '-', cluster_ID)) +
ggsignif::geom_signif(comparisons = list(c("low", "high")),
map_signif_level=TRUE) +
facet_wrap(~variable, scales='free')+
theme_bw()+
ggtitle(gsub('\\.', '-', cluster_ID))+
theme(strip.text.x = element_text(size = 12),
axis.text.x =element_text(size = 10),
axis.text.y =element_text(size = 10),
axis.title=element_text(size=12),
plot.title = element_text(size=20, hjust = 0.5, face="italic"),
legend.position = "none")
}
violin_deconv(x, ct=xcell_ct[c(1,4)], cluster_ID = "HLA.DRA")
violin_deconv(x, ct=xcell_ct[c(1,4)], cluster_ID = "HLA.DRB1")